Santa Clara County: What Do We Know?

MDSH Capstone Project

Author

Dr. Hua Zhou and Team #6

Published

April 10, 2025

1 Introduction

Let us use various data science tools in R (tidyverse, tidycensus) to explore the demographics of Santa Clara County. This dynamic document was produced by Quarto, which supports R, Python, and Julia.

Code
# Tract-level total population (ACS variable B01003_001) for Santa Clara
# County from the 2020 5-year ACS, with tract geometries attached.
scc_acs_2020 <- get_acs(
  geography = "tract",
  state = "CA",
  county = "Santa Clara",
  # Spelled out in full: the previous `variable =` only reached the
  # `variables` parameter through R's partial argument matching.
  variables = c(total_pop = "B01003_001"),
  year = 2020,
  geometry = TRUE,
  output = "wide"
) %>%
  # Reproject the geometries to EPSG:4326.
  st_transform(4326)
Getting data from the 2016-2020 5-year ACS

There are 408 census tracts in Santa Clara County. On average, each census tract contains 4000 residents. These tracts define our communities, and we want to visualize disparities in health, income, education, and housing across them.

Code
# Interactive map of the tract geometries held in `scc_acs_2020`, drawn
# without a legend.
mapview(scc_acs_2020, legend = FALSE)

2 Population and people

Code
# County-level total population (P2_001N) for Santa Clara County from the
# 2020 Decennial Census.
scc_dec_2020 <- get_decennial(
  geography = "county",
  county = "Santa Clara",
  state = "CA",
  year = 2020,
  variables = c(total_pop = "P2_001N")
)
Getting data from the 2020 decennial Census
Using the PL 94-171 Redistricting Data Summary File
Note: 2020 decennial Census data use differential privacy, a technique that
introduces errors into data to preserve respondent confidentiality.
ℹ Small counts should be interpreted with caution.
ℹ See https://www.census.gov/library/fact-sheets/2021/protecting-the-confidentiality-of-the-2020-census-redistricting-data.html for additional guidance.
This message is displayed once per session.
Code
# Population counts (P2_001N) from the 2020 Decennial Census, used for the
# national population size.
# NOTE(review): the request is at the "state" geography, so this holds one
# row per state-level unit; presumably the rows are summed downstream to get
# the national total -- confirm.
uspop20 <- get_decennial(
  year = 2020,
  variables = "P2_001N",
  geography = "state"
)
Getting data from the 2020 decennial Census
Using the PL 94-171 Redistricting Data Summary File

Santa Clara County has 1936259 residents, about 1% of the US population (2020 Decennial Census).

2.1 Age and sex

Code
# Median age (B01002_001) for Santa Clara County from the 2021 1-year ACS.
scc_acs1_2021 <- get_acs(
  survey = "acs1",
  year = 2021,
  geography = "county",
  state = "CA",
  county = "Santa Clara",
  variables = c(med_age = "B01002_001")
)
Getting data from the 2021 1-year ACS
The 1-year ACS provides data for geographies with populations of 65,000 and greater.

The median age is 38.2 in Santa Clara County (2021 ACS).

Population by Age and Sex (Pyramid Plot):

Code
# Population pyramid by age group and sex for Santa Clara County, built from
# the Census Bureau's 2019 population estimates ("characteristics" product).
# ingest
scc_pyramid <- get_estimates(
  geography = "county",
  county = c("Santa Clara"),
  state = "CA",
  product = "characteristics",
  breakdown = c("SEX", "AGEGROUP"),
  breakdown_labels = TRUE,
  year = 2019
) %>%
  # wrangle: keep only rows whose age label starts with "Age" and drop the
  # combined "Both sexes" rows
  filter(
    str_detect(AGEGROUP, "^Age"),
    SEX != "Both sexes"
  ) %>%
  # Negate male counts so male bars extend to the left of zero.
  mutate(value = ifelse(SEX == "Male", -value, value)) %>%
  # visualize
  ggplot(aes(x = value, y = AGEGROUP, fill = SEX)) +
  geom_col(width = 0.95, alpha = 0.75) +
  theme_minimal(base_family = "Verdana", base_size = 12) +
  scale_x_continuous(
    # Show absolute counts in thousands on both sides of the axis.
    labels = ~ number_format(scale = .001, suffix = "k")(abs(.x)),
    limits = 1000000 * c(-0.1, 0.1)
  ) +
  # Strip the "Age " prefix and " years" suffix from the tick labels.
  scale_y_discrete(labels = ~ str_remove_all(.x, "Age\\s|\\syears")) +
  scale_fill_manual(values = c("darkred", "navy")) +
  labs(x = "",
       # The data come from get_estimates() (population estimates), not the
       # ACS, so the label now names the correct source.
       y = "2019 Census Bureau population estimate",
       title = "Population structure in Santa Clara County",
       fill = "",
       caption = "Data source: US Census Bureau population estimates")

# Render the static ggplot as an interactive plotly widget.
ggplotly(scc_pyramid)
Code
# Share of residents in the "Age 65 to 69 years" group or any later group,
# computed from the 2019 population estimates by sex and age group.
senior_prop <- get_estimates(
  product = "characteristics",
  geography = "county",
  state = "CA",
  county = "Santa Clara",
  year = 2019,
  breakdown = c("SEX", "AGEGROUP"),
  breakdown_labels = TRUE
) %>%
  # Keep the per-sex rows whose age label starts with "Age".
  filter(SEX != "Both sexes", str_detect(AGEGROUP, "^Age")) %>%
  mutate(
    # An ordered factor lets the age groups be compared with `>=`.
    AGEGROUP = as.ordered(AGEGROUP),
    senior = AGEGROUP >= "Age 65 to 69 years"
  ) %>%
  # Population-weighted share of rows flagged as senior.
  summarise(seniorprop = sum(senior * value) / sum(value))

13.9% of Santa Clara County residents are 65 years and older (2019 Census Bureau population estimates).

2.2 Race and ethnicity

Code
# Tract-level counts for five race/ethnicity groups from the 2020 Decennial
# Census, with the tract total (P2_001N) attached as `summary_value` and the
# tract geometries included.
scc_race <- get_decennial(
  geography = "tract",
  county = "Santa Clara",
  state = "CA",
  year = 2020,
  geometry = TRUE,
  summary_var = "P2_001N",
  variables = c(
    Hispanic = "P2_002N",
    White = "P2_005N",
    Black = "P2_006N",
    Native = "P2_007N",
    Asian = "P2_008N"
  )
) %>%
  # Each group's share of the tract total, on a 0-100 scale.
  mutate(percent = 100 * (value / summary_value))
Getting data from the 2020 decennial Census
Using the PL 94-171 Redistricting Data Summary File

2.2.1 Faceted maps

Percentages of Asian, Black, Hispanic, Native, and White populations in each census tract in Santa Clara County.

Code
# Faceted choropleth: one panel per value of `variable`, with tracts shaded
# by `percent` in six quantile classes using the "Blues" palette.
# NOTE(review): `col`, `style`, `n`, `palette` and `title` are tmap v3
# arguments; tmap v4 still accepts them but prints migration messages.
scc_race %>%
  # Drop rows containing missing values before mapping.
  drop_na() %>%
  tm_shape() +
  tm_facets(by = "variable", scale.factor = 4) +
  tm_fill(col = "percent",
          style = "quantile",
          n = 6,
          palette = "Blues",
          title = "Percent (2020 US Census)")# +
── tmap v3 code detected ───────────────────────────────────────────────────────
[v3->v4] `tm_polygons()`: instead of `style = "quantile"`, use fill.scale =
`tm_scale_intervals()`.
ℹ Migrate the argument(s) 'style', 'n', 'palette' (rename to 'values') to
  'tm_scale_intervals(<HERE>)'
[v3->v4] `tm_polygons()`: use 'fill' for the fill color of polygons/symbols
(instead of 'col'), and 'col' for the outlines (instead of 'border.col').
[v3->v4] `tm_polygons()`: migrate the argument(s) related to the legend of the
visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'
[cols4all] color palettes: use palettes from the R package cols4all. Run
`cols4all::c4a_gui()` to explore them. The old palette name "Blues" is named
"brewer.blues"
Multiple palettes called "blues" found: "brewer.blues", "matplotlib.blues". The first one, "brewer.blues", is returned.

Code
  # tm_layout(bg.color = "grey",
  #           legend.position = c(-0.7, 0.15),
  #           panel.label.bg.color = "white")

2.2.2 Dot-density map

Code
# Turn the tract counts into dots, one dot per 50 people, keeping track of
# which group each dot belongs to.
scc_dots <- as_dot_density(
  drop_na(scc_race),
  value = "value",
  values_per_dot = 50,
  group = "variable"
)

# Tract polygons for the base layer; only the "White" rows are kept,
# presumably so each tract appears once.
background_tracts <- filter(drop_na(scc_race), variable == "White")

# White tracts with grey borders underneath, coloured dots on top.
tm_shape(background_tracts) +
  tm_polygons(border.col = "grey", col = "white") +
  tm_shape(scc_dots) +
  tm_dots(
    col = "variable",
    title = "1 dot = 50 people",
    palette = "Set1",
    size = 0.005
  ) +
  tm_layout(
    title = "Race/Ethnicity\n2020 Census",
    legend.outside = TRUE
  )

2.3 Segregation and diversity

The following table tallies the segregation indices \(H\) in major urban areas in California with population > 750,000. Higher \(H\) indicates more segregation.

Code
library(segregation)

# California tract-level counts for four race/ethnicity groups (table
# B03002) from the 2019 ACS, with tract geometries.
ca_acs_data <- get_acs(
  geography = "tract",
  state = "CA",
  year = 2019,
  geometry = TRUE,
  variables = c(
    white = "B03002_003",
    black = "B03002_004",
    asian = "B03002_006",
    hispanic = "B03002_012"
  )
)
Getting data from the 2015-2019 5-year ACS
Code
# Urban areas with their total population (B01001_001) and geometry from the
# 2019 1-year ACS, restricted to areas with at least 750,000 residents.
us_urban_areas <- get_acs(
  geography = "urban area",
  survey = "acs1",
  year = 2019,
  geometry = TRUE,
  variables = "B01001_001"
) %>%
  filter(estimate >= 750000) %>%
  # Keep only a shortened name column (plus geometry): the literal suffix
  # ", CA Urbanized Area (2010)" is stripped where present.
  transmute(
    urban_name = str_remove(NAME, fixed(", CA Urbanized Area (2010)"))
  )
Getting data from the 2019 1-year ACS
The 1-year ACS provides data for geographies with populations of 65,000 and greater.
Code
# Inner spatial join: keep only the California tracts that match one of the
# large urban areas, and carry that area's `urban_name` onto each tract.
# The tract NAME column and the geometry are then dropped.
ca_urban_data <- st_join(ca_acs_data, us_urban_areas, left = FALSE) %>%
  select(-NAME) %>%
  st_drop_geometry()

# Segregation index H within each urban area, most segregated first.
ca_urban_data %>%
  mutual_within(
    group = "variable",
    unit = "GEOID",
    weight = "estimate",
    within = "urban_name",
    wide = TRUE
  ) %>%
  select(urban_name, H) %>%
  arrange(desc(H))
Key: <urban_name>
                         urban_name         H
                             <char>     <num>
1: Los Angeles--Long Beach--Anaheim 0.2851662
2:           San Francisco--Oakland 0.2116127
3:                        San Diego 0.2025728
4:                         San Jose 0.1829190
5:                       Sacramento 0.1426804
6:        Riverside--San Bernardino 0.1408461

2.3.1 Local segregation analysis

Patterns of segregation across the San Francisco–Oakland urban area:

Code
# Tract-level local segregation scores for the San Francisco--Oakland
# urban area.
sf_local_seg <- mutual_local(
  data = filter(ca_urban_data, urban_name == "San Francisco--Oakland"),
  group = "variable",
  unit = "GEOID",
  weight = "estimate",
  wide = TRUE
)

# Attach the scores to 2019 California tract geometries; the inner join
# keeps only tracts that have a score.
sf_tracts_seg <- inner_join(
  tracts(state = "CA", cb = TRUE, year = 2019),
  sf_local_seg,
  by = "GEOID"
)

# Choropleth of the local segregation score (`ls`), drawn without tract
# borders in CRS 26946.
ggplot(sf_tracts_seg, aes(fill = ls)) +
  geom_sf(color = NA) +
  coord_sf(crs = 26946) +
  scale_fill_viridis_c(option = "inferno") +
  theme_void() +
  labs(fill = "Local\nsegregation index")

2.4 Language spoken at home

Code
# Language-spoken-at-home counts (ACS Data Profile DP02) for Santa Clara
# County from the 2021 5-year ACS, with DP02_0112 attached as the
# denominator (`summary_est`).
scc_lang <- get_acs(
  survey = "acs5",
  year = 2021,
  geography = "county",
  state = "CA",
  county = "Santa Clara",
  summary_var = "DP02_0112",
  variables = c(
    English = "DP02_0113",
    Spanish = "DP02_0116",
    Indo_European = "DP02_0118",
    Asian = "DP02_0120",
    Other = "DP02_0122"
  )
) %>%
  # Share of the denominator, as a proportion between 0 and 1.
  mutate(percent = estimate / summary_est)
Getting data from the 2017-2021 5-year ACS
Using the ACS Data Profile

53.7% of Santa Clara County residents speak a language other than English at home (2017-2021 ACS).

Code
# Bar chart of the share of residents by language spoken at home, with bars
# ordered from the largest share to the smallest.
scc_lang_plot <- scc_lang %>%
  ggplot(aes(x = fct_rev(fct_reorder(variable, percent)), y = percent)) +
  geom_col(color = "navy", fill = "navy",
           alpha = 0.5, width = 0.4) +
  # `percent` is a proportion; label_percent() multiplies it by 100.
  scale_y_continuous(labels = label_percent(scale = 100)) +
  labs(
    title = "Languages spoken at home in Santa Clara County",
    # The 2021 5-year ACS covers 2017-2021 (the old text said 2016-2021).
    subtitle = "2017-2021 ACS, population 5 years and over",
    x = "Language",
    y = "Percent"
  )

# Render the static ggplot as an interactive plotly widget.
ggplotly(scc_lang_plot)

2.5 Native and foreign born

Code
# Foreign-born counts by world region of birth (ACS Data Profile DP02) for
# Santa Clara County from the 2021 5-year ACS, with B01003_001 attached as
# the denominator (`summary_est`).
scc_foreignborn <- get_acs(
  survey = "acs5",
  year = 2021,
  geography = "county",
  state = "CA",
  county = "Santa Clara",
  summary_var = "B01003_001",
  variables = c(
    Total = "DP02_0105",
    Europe = "DP02_0106",
    Asia = "DP02_0107",
    Africa = "DP02_0108",
    Oceania = "DP02_0109",
    Latin_America = "DP02_0110",
    North_American = "DP02_0111"
  )
) %>%
  # Share of the denominator, as a proportion between 0 and 1.
  mutate(percent = estimate / summary_est)
Getting data from the 2017-2021 5-year ACS
Using the ACS Data Profile

39.9% of Santa Clara County residents are foreign-born (2017-2021 ACS).

Code
# Bar chart of the foreign-born population by region of birth, as a share
# of the county population, ordered from largest to smallest.
scc_foreignborn_plot <- scc_foreignborn %>%
  # Drop the overall foreign-born row so only the regions are plotted.
  filter(variable != "Total") %>%
  ggplot(aes(x = fct_rev(fct_reorder(variable, percent)), y = percent)) +
  geom_col(color = "navy", fill = "navy",
           alpha = 0.5, width = 0.5) +
  scale_y_continuous(labels = label_percent(scale = 100)) +
  labs(
    # The data are for Santa Clara County; "LA County" was a leftover from
    # another report. The 2021 5-year ACS covers 2017-2021.
    title = "Foreign born population in Santa Clara County",
    subtitle = "2017-2021 ACS",
    x = "Continent",
    y = "Percent"
  )

# Render the static ggplot as an interactive plotly widget.
ggplotly(scc_foreignborn_plot)

3 Employment

Code
# Lookup of friendly names to ACS variable codes used with the 2021 1-year
# ACS for Santa Clara County.
acs1_variables <- c(
  total_population = "B01003_001",
  med_age = "B01002_001",
  med_house_val = "B25077_001",
  med_rooms = "B25018_001",
  med_year_built = "B25037_001",
  med_income = "DP03_0062",
  pct_college = "DP02_0068P",
  pct_foreign_born = "DP02_0094P",
  pct_insured = "DP03_0096P",
  pct_ooh = "DP04_0046P",
  pct_white = "DP05_0077P",
  poverty_denom = "B17010_001",
  poverty_num = "B17010_002",
  pop_16above = "DP03_0001",
  emp_16above = "DP03_0004"
)

# Only the two employment variables are requested here; subsetting the
# named vector keeps both the names and the codes.
scc_acs1_2021 <- get_acs(
  survey = "acs1",
  year = 2021,
  geography = "county",
  state = "CA",
  county = "Santa Clara",
  variables = acs1_variables[c("pop_16above", "emp_16above")]
)
Getting data from the 2021 1-year ACS
The 1-year ACS provides data for geographies with populations of 65,000 and greater.
Using the ACS Data Profile

The employment rate in the population 16 years and over in Santa Clara County is 62.5% (2021 ACS).

4 Income and poverty

Code
# Median household income variable (DP03_0062) for Santa Clara County from
# the 2021 1-year ACS. This overwrites the previous `scc_acs1_2021`.
scc_acs1_2021 <- get_acs(
  survey = "acs1",
  year = 2021,
  geography = "county",
  state = "CA",
  county = "Santa Clara",
  variables = c(med_income = "DP03_0062")
)
Getting data from the 2021 1-year ACS
The 1-year ACS provides data for geographies with populations of 65,000 and greater.
Using the ACS Data Profile

The median household income in Santa Clara County is $141562 (2021 ACS).

Code
# Population counts in three poverty-ratio bands (table B06012) for Santa
# Clara County from the 2021 1-year ACS.
scc_poverty <- get_acs(
  survey = "acs1",
  year = 2021,
  geography = "county",
  county = "Santa Clara",
  state = "CA",
  variables = c(
    total_poverty_0_99 = "B06012_002",
    total_poverty_100_149 = "B06012_003",
    total_poverty_150_above = "B06012_004"
  )
)
Getting data from the 2021 1-year ACS
The 1-year ACS provides data for geographies with populations of 65,000 and greater.

6.8% of Santa Clara County residents live below the poverty line (2021 ACS).

Poverty-stricken neighborhoods:

Code
library(tidycensus)
library(ggiraph)
library(tidyverse)
library(patchwork)
library(scales)

# Tract-level median household income (DP03_0062) for Santa Clara County
# from the 2021 5-year ACS, in wide form (`med_incomeE` / `med_incomeM`)
# with tract geometries.
scc_medincome <- get_acs(
  survey = "acs5",
  year = 2021,
  geography = "tract",
  state = "CA",
  county = "Santa Clara",
  variables = c(med_income = "DP03_0062"),
  output = "wide",
  geometry = TRUE
)
Getting data from the 2017-2021 5-year ACS
Using the ACS Data Profile
Code
# The ten tracts with the LOWEST median household income (despite the
# "top10" name), with shortened tract labels for the plot axis.
medinc_top10 <- scc_medincome %>%
  slice_min(med_incomeE, n = 10) %>%
  mutate(NAME = str_remove(NAME, ", Santa Clara County, California")) %>%
  # Removing "Census" used to leave a leading space in the label; trim it.
  mutate(NAME = str_trim(str_remove(NAME, "Census")))

# sccmetro_map <- ggplot() +
#   geom_sf(data = county_subdivisions(state = "CA", county = "Santa Clara", cb = TRUE)) +
#   geom_sf_interactive(
#     data = medinc_top10, 
#     mapping = aes(fill = med_incomeE, data_id = GEOID)
#     ) +
#   scale_fill_distiller(
#     palette = "Greens",
#     direction = 1,
#     guide = "none"
#     ) +
#   theme_void()

# Dot plot of the estimates with margin-of-error bars, tracts ordered by
# income.
ggplot(
  data = medinc_top10,
  mapping = aes(
    x = med_incomeE,
    y = reorder(NAME, med_incomeE),
    fill = med_incomeE
  )
) +
  # Bars span estimate +/- the ACS margin of error.
  geom_errorbar(
    aes(xmin = med_incomeE - med_incomeM, xmax = med_incomeE + med_incomeM)
  ) +
  geom_point_interactive(
    color = "black",
    size = 4,
    shape = 21,
    aes(data_id = GEOID)
  ) +
  scale_fill_distiller(
    palette = "Greens",
    direction = 1,
    labels = label_dollar()
  ) +
  scale_x_continuous(labels = label_dollar()) +
  labs(title = "Household income by census tract in Santa Clara County",
       # The 2021 5-year ACS covers 2017-2021 (the old text said 2016-2021).
       subtitle = "2017-2021 American Community Survey",
       y = "",
       x = "ACS estimate (bars represent margin of error)",
       fill = "ACS estimate") +
  theme_minimal(base_size = 14)

Code
# girafe(ggobj = sccmetro_map + sccmetro_plot, width_svg = 10, height_svg = 5) %>%
#   girafe_options(opts_hover(css = "fill:red;"))

5 Education

Code
# Educational-attainment counts (table B06009) for Santa Clara County from
# the 2020 ACS, with B06009_001 attached as the denominator.
scc_edu <- get_acs(
  year = 2020,
  geography = "county",
  county = "Santa Clara",
  state = "CA",
  summary_var = "B06009_001",
  variables = c(
    less_hs = "B06009_002",
    hs = "B06009_003",
    as = "B06009_004",
    col = "B06009_005",
    grad = "B06009_006"
  )
)
Getting data from the 2016-2020 5-year ACS

53.5% of the population aged 25 and up in Santa Clara County have a bachelor’s degree or higher (2016-2020 ACS).

Code
# Tract-level DP02_0068P (named `pct_college`) for Santa Clara County from
# the 2020 5-year ACS, in wide form with tract geometries.
scc_pctcollege <- get_acs(
  survey = "acs5",
  year = 2020,
  geography = "tract",
  state = "CA",
  county = "Santa Clara",
  variables = c(pct_college = "DP02_0068P"),
  output = "wide",
  geometry = TRUE
)
Getting data from the 2016-2020 5-year ACS
Using the ACS Data Profile
Code
# Interactive tract map coloured by the `pct_collegeE` estimate. The layer
# label is passed as `layer.name` (spelled out in full, matching the
# commute-time map) instead of the abbreviated `layer =`.
mapview(scc_pctcollege, zcol = "pct_collegeE", layer.name = "% college")

6 Commuting

Code
# Commuting-mode percentages and mean travel time to work (ACS Data Profile
# DP03) for Santa Clara County, 2021.
scc_commute <- get_acs(
  year = 2021,
  geography = "county",
  state = "CA",
  county = "Santa Clara",
  variables = c(
    Drove_alone = "DP03_0019P",
    Carpool = "DP03_0020P",
    Public_transportation = "DP03_0021P",
    Walk = "DP03_0022P",
    Other_means = "DP03_0023P",
    Work_from_home = "DP03_0024P",
    Mean_travel_time_to_work_in_minutes = "DP03_0025"
  )
)
Getting data from the 2017-2021 5-year ACS
Using the ACS Data Profile
Code
# Bar chart of commuting modes, ordered from the most to the least common.
scc_commute_plot <- scc_commute %>%
  # The mean travel time is in minutes, not a share, so leave it out.
  filter(!str_detect(variable, "Mean_travel_time")) %>%
  ggplot() +
  geom_col(
    # The estimates are divided by 100 so scales::percent can format them.
    aes(x = fct_rev(fct_reorder(variable, estimate)), y = estimate / 100),
    color = "navy",
    fill = "navy",
    alpha = 0.5,
    width = 0.6
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    # The data are for Santa Clara County; "LA County" was a leftover from
    # another report. The 2021 5-year ACS covers 2017-2021.
    title = "Commuting mode in Santa Clara County",
    subtitle = "2017-2021 ACS",
    x = NULL,
    y = "Percent among workers 16 years and over"
  )

# Render the static ggplot as an interactive plotly widget.
ggplotly(scc_commute_plot)

An average Santa Clara worker (16 years or over) spends 28.5 minutes commuting to work.

Code
# Tract-level mean travel time to work (DP03_0025) from the 2021 5-year
# ACS, piped straight into an interactive map coloured by the estimate.
get_acs(
  survey = "acs5",
  year = 2021,
  geography = "tract",
  state = "CA",
  county = "Santa Clara",
  variables = c(Commute_Time_in_Minutes = "DP03_0025"),
  output = "wide",
  geometry = TRUE
) %>%
  # print() %>%
  mapview(layer.name = "Minutes", zcol = "Commute_Time_in_MinutesE")
Getting data from the 2017-2021 5-year ACS
Using the ACS Data Profile

7 Housing (rent burden)

Code
# PUMA in LA metro: 2019 California PUMA boundaries whose name mentions
# Los Angeles County or Orange County.
lametro_pumas <- pumas(state = "CA", cb = TRUE, year = 2019) %>%
  filter(str_detect(NAME10, "(Los Angeles County|Orange County)"))

# PUMS variable
hh_variables <- c("PUMA", "GRPIP", "RAC1P",
                  "HISP", "HHT")

# ingest PUMS data
# The download below is commented out; a cached copy is read from
# "lametro_hh_data.rds" instead.
# get_pums(
#   variables = hh_variables,
#   state = "CA",
#   puma = lametro_pumas$PUMACE10,
#   year = 2020,
#   variables_filter = list(
#     SPORDER = 1,
#     TEN = 3
#   ),
#   recode = TRUE
# ) %>%
read_rds("lametro_hh_data.rds") %>%
  # recode
  mutate(
    # HISP "01" is treated as non-Hispanic; RAC1P "1" and "2" are labelled
    # White and Black; everything else becomes "Other".
    race_ethnicity = case_when(
      HISP != "01" ~ "Hispanic",
      HISP == "01" & RAC1P == "1" ~ "White",
      HISP == "01" & RAC1P == "2" ~ "Black",
      TRUE ~ "Other"
    ),
    # HHT "1" is labelled "Married"; all other values "Not married".
    married = case_when(
      HHT == "1" ~ "Married",
      TRUE ~ "Not married"
    )
  ) %>%
  # group-wise summary
  filter(race_ethnicity != "Other") %>%
  group_by(race_ethnicity, married, PUMA) %>%
  summarize(
    # Weighted share of households with GRPIP of 40 or more.
    prop_above_40 = sum(WGTP[GRPIP >= 40]) / sum(WGTP),
    # Drop the grouping so the saved table is a plain, ungrouped tibble and
    # summarize() does not emit its regrouping message.
    .groups = "drop"
  ) %>%
  # join with PUMA
  left_join(lametro_pumas, by = c("PUMA" = "PUMACE10")) %>%
  # save as rds
  write_rds(file = "lametro_data_for_map.rds") %>%
  print()
Code
library(tmap)

# Faceted choropleth of the rent-burdened share by PUMA, one panel per
# race/ethnicity x marital-status combination, in five quantile classes.
read_rds("lametro_data_for_map.rds") %>%
  # The saved table carries the PUMA geometry column; turn it back into sf.
  st_as_sf() %>%
  tm_shape() +
  tm_facets(by = c("race_ethnicity", "married"), scale.factor = 6) +
  tm_fill(col = "prop_above_40",
          style = "quantile",
          n = 5,
          palette = "Blues",
          title = "Percent households") +
  tm_layout(bg.color = "grey",
            legend.outside = TRUE,
            panel.label.bg.color = "white",
            # The joined PUMAs cover Los Angeles and Orange County, so the
            # title says "LA metro", matching the companion dot plot.
            main.title = "Rent burdened-households in LA metro\n2016-2020 ACS estimate (from PUMS data)")

We define a household to be rent-burdened when gross rent is 40 percent or more of household income.

Code
library(survey)
library(srvyr)

# Survey-weighted share of rent-burdened households by race/ethnicity and
# marital status, plotted with error bars.
read_rds("lametro_hh_replicate.rds") %>%
  # Build a housing-level survey design that uses replicate weights.
  to_survey(type = "housing",
            design = "rep_weights") %>%
  # Subset to TEN == 3 after the design object has been created.
  filter(TEN == 3) %>%
  mutate(
    # HISP "01" is treated as non-Hispanic; RAC1P "1" and "2" are labelled
    # White and Black; everything else becomes "Other".
    race_ethnicity = case_when(
      HISP != "01" ~ "Hispanic",
      HISP == "01" & RAC1P == "1" ~ "White",
      HISP == "01" & RAC1P == "2" ~ "Black",
      TRUE ~ "Other"
    ),
    # HHT "1" is labelled "Married"; all other values "Not married".
    married = case_when(
      HHT == "1" ~ "Married",
      TRUE ~ "Not married"
    ),
    # Indicator for GRPIP of 40 or more (the rent-burden threshold).
    above_40 = GRPIP >= 40
  ) %>%
  filter(race_ethnicity != "Other") %>%
  group_by(race_ethnicity, married) %>%
  summarize(
    # Mean of the indicator = estimated share; the `prop_above_40_se`
    # column used below comes from this call.
    prop_above_40 = survey_mean(above_40)
  ) %>%
  # MOE: standard error times 1.645, the multiplier for the 90 percent
  # confidence level named in the caption.
  mutate(prop_above_40_moe = prop_above_40_se * 1.645,
         label = paste(race_ethnicity, married, sep = ", "))  %>%
  ggplot(aes(
    x = prop_above_40,
    y = reorder(label, prop_above_40)
    )) +
  geom_errorbar(aes(xmin = prop_above_40 - prop_above_40_moe,
                     xmax = prop_above_40 + prop_above_40_moe)) +
  geom_point(size = 3, color = "navy") +
  labs(title = "Rent burdened-households in LA metro",
       x = "2016-2020 ACS estimate (from PUMS data)",
       y = "",
       caption = "Rent-burdened defined when gross rent is 40 percent or more\nof household income. Error bars represent a 90 percent confidence level.") +
  scale_x_continuous(labels = scales::percent) +
  theme_grey(base_size = 12)

8 Health

8.1 Insurance coverage and disparity

Code
# Health-insurance coverage by employment status (ACS table B27011) for
# Santa Clara County from the 2021 1-year ACS.
#
# Within each employment group the table lists "with coverage" before "no
# coverage" (004/007 employed, 009/012 unemployed), so 014 is the insured
# and 017 the uninsured count for people not in the labor force. The two
# labels were previously swapped, and B27011_004 was listed twice.
# NOTE(review): confirm the 014/017 meanings against the B27011 table shell.
scc_inscov <- get_acs(
  geography = "county",
  variables = c(
    total = "B27011_001",
    in_labor_force_employed_insured = "B27011_004",
    in_labor_force_employed_uninsured = "B27011_007",
    in_labor_force_unemployed_insured = "B27011_009",
    in_labor_force_unemployed_uninsured = "B27011_012",
    notin_labor_force_insured = "B27011_014",
    notin_labor_force_uninsured = "B27011_017"
  ),
  county = "Santa Clara",
  state = "CA",
  year = 2021,
  # No trailing comma here: it would pass an empty extra argument.
  survey = "acs1"
  # output = "wide"
)
Getting data from the 2021 1-year ACS
The 1-year ACS provides data for geographies with populations of 65,000 and greater.

5.0% of Santa Clara residents don’t have insurance (2021 ACS).

Code
# Tract-level DP03_0096P (named `pct_insured`) for Santa Clara County,
# 2021, in wide form with tract geometries.
scc_insured <- get_acs(
  year = 2021,
  geography = "tract",
  state = "CA",
  county = "Santa Clara",
  variables = c(pct_insured = "DP03_0096P"),
  output = "wide",
  geometry = TRUE
)
Getting data from the 2017-2021 5-year ACS
Using the ACS Data Profile
Code
# Interactive tract map coloured by the `pct_insuredE` estimate. The layer
# label is passed as `layer.name` (spelled out in full, matching the
# commute-time map) instead of the abbreviated `layer =`.
mapview(scc_insured, zcol = "pct_insuredE", layer.name = "% insured")